In [220]:
# Core numerics / plotting stack for the Zillow EDA below.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()  # default palette; indexed by the bar plots below

%matplotlib inline

# Silence SettingWithCopyWarning globally.  NOTE(review): several cells rely
# on chained assignment "working" under this setting (see convert_outlier_value),
# which is fragile -- prefer df.loc[mask, col] = value.
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999

In [278]:
def draw_scatter_plot(df, col_name):
    """Scatter plot of the sorted values of ``df[col_name]`` against their rank.

    Useful for eyeballing a column's distribution and spotting outliers at
    either end of the range.
    """
    # Delegate to the array-based variant defined below -- the two bodies were
    # identical copies, so keep the plotting logic in one place.
    draw_np_array_scatter_plot(df[col_name].values, col_name)
    
    
def draw_dist_plot(df, col_name):
    """Histogram (50 bins, no KDE) of ``df[col_name]``.

    Thin wrapper over the array-based variant so the plotting code is not
    duplicated (the two bodies were identical copies).
    """
    draw_np_array_dist_plot(df[col_name].values, col_name)
    
    
def draw_count_plot(df, col_name, title='plot'):
    """Bar chart of the value counts of ``col_name`` in ``df``.

    Tick labels are drawn vertically so long category names stay readable.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.countplot(data=df, x=col_name, ax=ax)
    ax.tick_params(axis='x', rotation=90)
    ax.set_xlabel(col_name, fontsize=12)
    ax.set_ylabel('count', fontsize=12)
    ax.set_title(title, fontsize=15)
    plt.show()
    
    
def draw_box_plot(df, x_col, y_col):
    """Box plot of ``df[y_col]`` grouped by ``df[x_col]``."""
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(data=df, x=x_col, y=y_col, ax=ax)
    ax.set_xlabel(x_col, fontsize=12)
    ax.set_ylabel(y_col, fontsize=12)
    plt.show()
    
    
def draw_violin_plot(df, x_col, y_col):
    """Violin plot of ``df[y_col]`` grouped by ``df[x_col]``."""
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.violinplot(data=df, x=x_col, y=y_col, ax=ax)
    ax.set_xlabel(x_col, fontsize=12)
    ax.set_ylabel(y_col, fontsize=12)
    plt.show()
    

def draw_plots(df, col_name):
    """Convenience wrapper: rank-scatter plot plus histogram for one column."""
    for plot_fn in (draw_scatter_plot, draw_dist_plot):
        plot_fn(df, col_name)

    
def draw_np_array_scatter_plot(np_array, col_name):
    """Scatter plot of a raw numpy array's sorted values against their rank."""
    sorted_vals = np.sort(np_array)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(range(len(sorted_vals)), sorted_vals)
    ax.set_xlabel('index', fontsize=12)
    ax.set_ylabel(col_name, fontsize=12)
    plt.show()
    
    
def draw_np_array_dist_plot(np_array, col_name):
    """Histogram (50 bins, KDE disabled) of a raw numpy array."""
    plt.figure(figsize=(12, 8))
    # NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14);
    # switch to sns.histplot once the environment's seaborn is >= 0.11.
    sns.distplot(np_array, bins=50, kde=False)
    plt.xlabel(col_name, fontsize=12)
    plt.ylabel('count', fontsize=12)
    plt.show()
    

def convert_outlier_value(df, col_name, upper_percentile=99.0, lower_percentile=1.0):
    """Winsorize ``df[col_name]`` in place: clamp values outside the given percentiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame modified in place.
    col_name : str
        Column whose values are clamped.
    upper_percentile, lower_percentile : float
        Percentiles (0-100) defining the upper/lower clamp limits.
    """
    np_array = df[col_name].values

    ulimit = np.percentile(np_array, upper_percentile)
    llimit = np.percentile(np_array, lower_percentile)
    print('upper limit :', ulimit, ', lower limit :', llimit)

    # Single .loc assignment on the frame itself.  The original used chained
    # assignment (df[col_name].loc[mask] = value), which writes through a
    # temporary and only "worked" because SettingWithCopyWarning is globally
    # silenced at the top of the notebook.
    df.loc[df[col_name] > ulimit, col_name] = ulimit
    df.loc[df[col_name] < llimit, col_name] = llimit

In [222]:
# List the competition input files (shell-out via subprocess; `!ls input`
# would also work in a notebook).
from subprocess import check_output
print(check_output(['ls', 'input']).decode('utf8'))


properties_2016.csv
sample_submission.csv
train_2016_v2.csv
zillow_data_dictionary.xlsx


In [223]:
# Load 2016 transactions; parse transactiondate so .dt accessors work below.
train_df = pd.read_csv('input/train_2016_v2.csv', parse_dates=['transactiondate'])
print('train_df.shape :', train_df.shape)
display(train_df.head())


train_df.shape : (90275, 3)
parcelid logerror transactiondate
0 11016594 0.0276 2016-01-01
1 14366692 -0.1684 2016-01-01
2 12098116 -0.0040 2016-01-01
3 12643413 0.0218 2016-01-02
4 14432541 -0.0050 2016-01-02

In [224]:
draw_plots(train_df, 'logerror')



In [225]:
convert_outlier_value(train_df, 'logerror')


upper limit : 0.463882 , lower limit : -0.3425

In [226]:
draw_plots(train_df, 'logerror')



In [227]:
train_df['transaction_month'] = train_df['transactiondate'].dt.month
display(train_df.head())


parcelid logerror transactiondate transaction_month
0 11016594 0.0276 2016-01-01 1
1 14366692 -0.1684 2016-01-01 1
2 12098116 -0.0040 2016-01-01 1
3 12643413 0.0218 2016-01-02 1
4 14432541 -0.0050 2016-01-02 1

In [228]:
transactiondate_cnt_srs = train_df['transactiondate'].value_counts()

plt.figure(figsize=(8,6))
plt.scatter(range(len(transactiondate_cnt_srs)), np.sort(transactiondate_cnt_srs))
plt.xlabel('transactiondate', fontsize=12)
plt.ylabel('counts', fontsize=12)
plt.show()



In [229]:
# Number of transactions per calendar month.
cnt_srs = train_df['transaction_month'].value_counts()

plt.figure(figsize=(12, 6))
# Pass x/y as keywords: positional data arguments to seaborn plotters were
# deprecated in 0.12 and removed later; keywords work on all versions.
sns.barplot(x=cnt_srs.index, y=cnt_srs.values, alpha=0.8, color=color[3])
plt.xlabel('Month of transaction', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.show()



In [230]:
(train_df['parcelid'].value_counts().reset_index())['parcelid'].value_counts()


Out[230]:
1    90026
2      123
3        1
Name: parcelid, dtype: int64

In [231]:
prop_df = pd.read_csv('input/properties_2016.csv', low_memory=False)
print('properties 2016 shape :', prop_df.shape)
display(prop_df.head())


properties 2016 shape : (2985217, 58)
parcelid airconditioningtypeid architecturalstyletypeid basementsqft bathroomcnt bedroomcnt buildingclasstypeid buildingqualitytypeid calculatedbathnbr decktypeid finishedfloor1squarefeet calculatedfinishedsquarefeet finishedsquarefeet12 finishedsquarefeet13 finishedsquarefeet15 finishedsquarefeet50 finishedsquarefeet6 fips fireplacecnt fullbathcnt garagecarcnt garagetotalsqft hashottuborspa heatingorsystemtypeid latitude longitude lotsizesquarefeet poolcnt poolsizesum pooltypeid10 pooltypeid2 pooltypeid7 propertycountylandusecode propertylandusetypeid propertyzoningdesc rawcensustractandblock regionidcity regionidcounty regionidneighborhood regionidzip roomcnt storytypeid threequarterbathnbr typeconstructiontypeid unitcnt yardbuildingsqft17 yardbuildingsqft26 yearbuilt numberofstories fireplaceflag structuretaxvaluedollarcnt taxvaluedollarcnt assessmentyear landtaxvaluedollarcnt taxamount taxdelinquencyflag taxdelinquencyyear censustractandblock
0 10754147 NaN NaN NaN 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6037.0 NaN NaN NaN NaN NaN NaN 34144442.0 -118654084.0 85768.0 NaN NaN NaN NaN NaN 010D 269.0 NaN 6.037800e+07 37688.0 3101.0 NaN 96337.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 9.0 2015.0 9.0 NaN NaN NaN NaN
1 10759547 NaN NaN NaN 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6037.0 NaN NaN NaN NaN NaN NaN 34140430.0 -118625364.0 4083.0 NaN NaN NaN NaN NaN 0109 261.0 LCA11* 6.037800e+07 37688.0 3101.0 NaN 96337.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 27516.0 2015.0 27516.0 NaN NaN NaN NaN
2 10843547 NaN NaN NaN 0.0 0.0 NaN NaN NaN NaN NaN 73026.0 NaN NaN 73026.0 NaN NaN 6037.0 NaN NaN NaN NaN NaN NaN 33989359.0 -118394633.0 63085.0 NaN NaN NaN NaN NaN 1200 47.0 LAC2 6.037703e+07 51617.0 3101.0 NaN 96095.0 0.0 NaN NaN NaN 2.0 NaN NaN NaN NaN NaN 650756.0 1413387.0 2015.0 762631.0 20800.37 NaN NaN NaN
3 10859147 NaN NaN NaN 0.0 0.0 3.0 7.0 NaN NaN NaN 5068.0 NaN NaN 5068.0 NaN NaN 6037.0 NaN NaN NaN NaN NaN NaN 34148863.0 -118437206.0 7521.0 NaN NaN NaN NaN NaN 1200 47.0 LAC2 6.037141e+07 12447.0 3101.0 27080.0 96424.0 0.0 NaN NaN NaN NaN NaN NaN 1948.0 1.0 NaN 571346.0 1156834.0 2015.0 585488.0 14557.57 NaN NaN NaN
4 10879947 NaN NaN NaN 0.0 0.0 4.0 NaN NaN NaN NaN 1776.0 NaN NaN 1776.0 NaN NaN 6037.0 NaN NaN NaN NaN NaN NaN 34194168.0 -118385816.0 8512.0 NaN NaN NaN NaN NaN 1210 31.0 LAM1 6.037123e+07 12447.0 3101.0 46795.0 96450.0 0.0 NaN NaN NaN 1.0 NaN NaN 1947.0 NaN NaN 193796.0 433491.0 2015.0 239695.0 5725.17 NaN NaN NaN

In [232]:
# Per-column missing-value counts for the property table, ascending.
missing_df = (
    prop_df.isnull()
           .sum(axis=0)
           .reset_index()
)
missing_df.columns = ['column_name', 'missing_count']
missing_df = missing_df.loc[missing_df['missing_count'] > 0].sort_values(by='missing_count')
display(missing_df)


column_name missing_count
35 rawcensustractandblock 11437
25 longitude 11437
24 latitude 11437
37 regionidcounty 11437
17 fips 11437
33 propertylandusetypeid 11437
52 assessmentyear 11439
5 bedroomcnt 11450
4 bathroomcnt 11462
40 roomcnt 11475
32 propertycountylandusecode 12277
39 regionidzip 13980
54 taxamount 31250
51 taxvaluedollarcnt 42550
50 structuretaxvaluedollarcnt 54982
11 calculatedfinishedsquarefeet 55565
47 yearbuilt 59928
36 regionidcity 62845
53 landtaxvaluedollarcnt 67733
57 censustractandblock 75126
19 fullbathcnt 128912
8 calculatedbathnbr 128912
12 finishedsquarefeet12 276033
26 lotsizesquarefeet 276099
34 propertyzoningdesc 1006588
44 unitcnt 1007727
7 buildingqualitytypeid 1046729
23 heatingorsystemtypeid 1178816
38 regionidneighborhood 1828815
21 garagetotalsqft 2101950
20 garagecarcnt 2101950
1 airconditioningtypeid 2173698
48 numberofstories 2303148
27 poolcnt 2467683
31 pooltypeid7 2499758
18 fireplacecnt 2672580
42 threequarterbathnbr 2673586
10 finishedfloor1squarefeet 2782500
15 finishedsquarefeet50 2782500
14 finishedsquarefeet15 2794419
45 yardbuildingsqft17 2904862
22 hashottuborspa 2916203
56 taxdelinquencyyear 2928753
55 taxdelinquencyflag 2928755
29 pooltypeid10 2948278
30 pooltypeid2 2953142
28 poolsizesum 2957257
16 finishedsquarefeet6 2963216
9 decktypeid 2968121
6 buildingclasstypeid 2972588
13 finishedsquarefeet13 2977545
43 typeconstructiontypeid 2978470
2 architecturalstyletypeid 2979156
49 fireplaceflag 2980054
46 yardbuildingsqft26 2982570
3 basementsqft 2983589
41 storytypeid 2983593

In [233]:
# Horizontal bar chart of missing counts, one bar per column.
ind = np.arange(missing_df.shape[0])
fig, ax = plt.subplots(figsize=(12, 18))
rects = ax.barh(ind, missing_df.missing_count.values, color='blue')
# Set tick POSITIONS before labels.  The original called plt.yticks(ind)
# *after* set_yticklabels, which resets the tick locator and drops or
# misaligns the labels on recent matplotlib.
ax.set_yticks(ind)
ax.set_yticklabels(missing_df.column_name.values)
plt.xlabel('Count of missing values')
plt.title('Number of missing values in each column')
plt.show()



In [234]:
plt.figure(figsize=(12,12))
sns.jointplot('latitude', 'longitude', data=prop_df, size=10)
plt.show()


<matplotlib.figure.Figure at 0x7f644dd22ba8>

In [235]:
train_df = pd.merge(train_df, prop_df, on='parcelid', how='left')
train_df.head()


Out[235]:
parcelid logerror transactiondate transaction_month airconditioningtypeid architecturalstyletypeid basementsqft bathroomcnt bedroomcnt buildingclasstypeid buildingqualitytypeid calculatedbathnbr decktypeid finishedfloor1squarefeet calculatedfinishedsquarefeet finishedsquarefeet12 finishedsquarefeet13 finishedsquarefeet15 finishedsquarefeet50 finishedsquarefeet6 fips fireplacecnt fullbathcnt garagecarcnt garagetotalsqft hashottuborspa heatingorsystemtypeid latitude longitude lotsizesquarefeet poolcnt poolsizesum pooltypeid10 pooltypeid2 pooltypeid7 propertycountylandusecode propertylandusetypeid propertyzoningdesc rawcensustractandblock regionidcity regionidcounty regionidneighborhood regionidzip roomcnt storytypeid threequarterbathnbr typeconstructiontypeid unitcnt yardbuildingsqft17 yardbuildingsqft26 yearbuilt numberofstories fireplaceflag structuretaxvaluedollarcnt taxvaluedollarcnt assessmentyear landtaxvaluedollarcnt taxamount taxdelinquencyflag taxdelinquencyyear censustractandblock
0 11016594 0.0276 2016-01-01 1 1.0 NaN NaN 2.0 3.0 NaN 4.0 2.0 NaN NaN 1684.0 1684.0 NaN NaN NaN NaN 6037.0 NaN 2.0 NaN NaN NaN 2.0 34280990.0 -118488536.0 7528.0 NaN NaN NaN NaN NaN 0100 261.0 LARS 6.037107e+07 12447.0 3101.0 31817.0 96370.0 0.0 NaN NaN NaN 1.0 NaN NaN 1959.0 NaN NaN 122754.0 360170.0 2015.0 237416.0 6735.88 NaN NaN 6.037107e+13
1 14366692 -0.1684 2016-01-01 1 NaN NaN NaN 3.5 4.0 NaN NaN 3.5 NaN NaN 2263.0 2263.0 NaN NaN NaN NaN 6059.0 NaN 3.0 2.0 468.0 NaN NaN 33668120.0 -117677556.0 3643.0 NaN NaN NaN NaN NaN 1 261.0 NaN 6.059052e+07 32380.0 1286.0 NaN 96962.0 0.0 NaN 1.0 NaN NaN NaN NaN 2014.0 NaN NaN 346458.0 585529.0 2015.0 239071.0 10153.02 NaN NaN NaN
2 12098116 -0.0040 2016-01-01 1 1.0 NaN NaN 3.0 2.0 NaN 4.0 3.0 NaN NaN 2217.0 2217.0 NaN NaN NaN NaN 6037.0 NaN 3.0 NaN NaN NaN 2.0 34136312.0 -118175032.0 11423.0 NaN NaN NaN NaN NaN 0100 261.0 PSR6 6.037464e+07 47019.0 3101.0 275411.0 96293.0 0.0 NaN NaN NaN 1.0 NaN NaN 1940.0 NaN NaN 61994.0 119906.0 2015.0 57912.0 11484.48 NaN NaN 6.037464e+13
3 12643413 0.0218 2016-01-02 1 1.0 NaN NaN 2.0 2.0 NaN 4.0 2.0 NaN NaN 839.0 839.0 NaN NaN NaN NaN 6037.0 NaN 2.0 NaN NaN NaN 2.0 33755800.0 -118309000.0 70859.0 NaN NaN NaN NaN NaN 010C 266.0 LAR3 6.037296e+07 12447.0 3101.0 54300.0 96222.0 0.0 NaN NaN NaN 1.0 NaN NaN 1987.0 NaN NaN 171518.0 244880.0 2015.0 73362.0 3048.74 NaN NaN 6.037296e+13
4 14432541 -0.0050 2016-01-02 1 NaN NaN NaN 2.5 4.0 NaN NaN 2.5 NaN NaN 2283.0 2283.0 NaN NaN NaN NaN 6059.0 NaN 2.0 2.0 598.0 NaN NaN 33485643.0 -117700234.0 6000.0 1.0 NaN NaN NaN 1.0 122 261.0 NaN 6.059042e+07 17686.0 1286.0 NaN 96961.0 8.0 NaN 1.0 NaN NaN NaN NaN 1981.0 2.0 NaN 169574.0 434551.0 2015.0 264977.0 5488.96 NaN NaN 6.059042e+13

In [236]:
pd.options.display.max_rows = 65
dtype_df = train_df.dtypes.reset_index()
# NOTE(review): the labels look swapped on purpose -- the first column holds
# the column *names* and the second their dtypes.  Naming the first one
# 'Count' makes the later groupby('Column Type').count() render its size
# column under the header 'Count'.  Clearer names would be less surprising.
dtype_df.columns = ['Count', 'Column Type']
dtype_df


Out[236]:
Count Column Type
0 parcelid int64
1 logerror float64
2 transactiondate datetime64[ns]
3 transaction_month int64
4 airconditioningtypeid float64
5 architecturalstyletypeid float64
6 basementsqft float64
7 bathroomcnt float64
8 bedroomcnt float64
9 buildingclasstypeid float64
10 buildingqualitytypeid float64
11 calculatedbathnbr float64
12 decktypeid float64
13 finishedfloor1squarefeet float64
14 calculatedfinishedsquarefeet float64
15 finishedsquarefeet12 float64
16 finishedsquarefeet13 float64
17 finishedsquarefeet15 float64
18 finishedsquarefeet50 float64
19 finishedsquarefeet6 float64
20 fips float64
21 fireplacecnt float64
22 fullbathcnt float64
23 garagecarcnt float64
24 garagetotalsqft float64
25 hashottuborspa object
26 heatingorsystemtypeid float64
27 latitude float64
28 longitude float64
29 lotsizesquarefeet float64
30 poolcnt float64
31 poolsizesum float64
32 pooltypeid10 float64
33 pooltypeid2 float64
34 pooltypeid7 float64
35 propertycountylandusecode object
36 propertylandusetypeid float64
37 propertyzoningdesc object
38 rawcensustractandblock float64
39 regionidcity float64
40 regionidcounty float64
41 regionidneighborhood float64
42 regionidzip float64
43 roomcnt float64
44 storytypeid float64
45 threequarterbathnbr float64
46 typeconstructiontypeid float64
47 unitcnt float64
48 yardbuildingsqft17 float64
49 yardbuildingsqft26 float64
50 yearbuilt float64
51 numberofstories float64
52 fireplaceflag object
53 structuretaxvaluedollarcnt float64
54 taxvaluedollarcnt float64
55 assessmentyear float64
56 landtaxvaluedollarcnt float64
57 taxamount float64
58 taxdelinquencyflag object
59 taxdelinquencyyear float64
60 censustractandblock float64

In [237]:
dtype_df.groupby('Column Type').aggregate('count').reset_index()


Out[237]:
Column Type Count
0 int64 2
1 float64 53
2 datetime64[ns] 1
3 object 5

In [238]:
# Missing-value ratio on the merged train frame.  The same boilerplate is
# repeated in later cells -- a small helper would remove the duplication.
missing_df = train_df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df['missing_ratio'] = missing_df['missing_count'] / train_df.shape[0]
# Columns that are almost entirely missing (> 99%).
missing_df.loc[missing_df['missing_ratio'] > 0.99]


Out[238]:
column_name missing_count missing_ratio
5 architecturalstyletypeid 90014 0.997109
6 basementsqft 90232 0.999524
9 buildingclasstypeid 90259 0.999823
12 decktypeid 89617 0.992711
16 finishedsquarefeet13 90242 0.999634
19 finishedsquarefeet6 89854 0.995336
44 storytypeid 90232 0.999524
46 typeconstructiontypeid 89976 0.996688
49 yardbuildingsqft26 90180 0.998948
52 fireplaceflag 90053 0.997541

In [239]:
# Impute numeric columns with their column means.  numeric_only=True is
# required on modern pandas, where DataFrame.mean raises a TypeError on
# object/datetime columns instead of silently skipping them; on older pandas
# it is behavior-equivalent to the implicit default.
mean_values = train_df.mean(axis=0, numeric_only=True)
train_df_new = train_df.fillna(mean_values)

In [240]:
missing_df = train_df_new.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df['missing_ratio'] = missing_df['missing_count'] / train_df_new.shape[0]
missing_df.loc[missing_df['missing_ratio'] > 0]['column_name']


Out[240]:
35    propertycountylandusecode
37           propertyzoningdesc
58           taxdelinquencyflag
Name: column_name, dtype: object

In [241]:
# Normalise taxdelinquencyflag to a clean boolean: the raw column is 'Y' for
# delinquent parcels and NaN otherwise (value_counts above shows only 'Y').
# `x == True` keeps any already-boolean values; everything else (NaN, other
# strings) maps to False.
print(train_df_new['taxdelinquencyflag'].value_counts())
train_df_new['taxdelinquencyflag'] = train_df_new['taxdelinquencyflag'] \
                .apply(lambda x: True if x == True or x == 'Y' else False)
print(train_df_new['taxdelinquencyflag'].value_counts())


Y    1783
Name: taxdelinquencyflag, dtype: int64
False    88492
True      1783
Name: taxdelinquencyflag, dtype: int64

In [242]:
train_df_new = train_df_new.fillna('UNKNOWN')

In [243]:
missing_df = train_df_new.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df['missing_ratio'] = missing_df['missing_count'] / train_df_new.shape[0]
missing_df.loc[missing_df['missing_ratio'] > 0]['column_name']


Out[243]:
Series([], Name: column_name, dtype: object)

In [244]:
# Pearson correlation of every float64 feature with the target (logerror).
x_cols = [c for c in train_df_new.columns
          if c != 'logerror' and train_df_new[c].dtype == 'float64']

labels = []
values = []
for c in x_cols:
    if np.std(train_df_new[c]) == 0.0:
        # Constant column: correlation is undefined, so skip it.
        print('std is 0', c)
    else:
        labels.append(c)
        values.append(np.corrcoef(train_df_new[c].values,
                                  train_df_new.logerror.values)[0, 1])

corr_df = pd.DataFrame({'col_labels': labels, 'corr_values': values}).sort_values(by='corr_values')


std is 0 buildingclasstypeid
std is 0 decktypeid
std is 0 poolcnt
std is 0 pooltypeid10
std is 0 pooltypeid2
std is 0 pooltypeid7
std is 0 storytypeid
std is 0 assessmentyear

In [245]:
ind = np.arange(len(labels))
fig, ax = plt.subplots(figsize=(12, 40))
rects = ax.barh(ind, np.array(corr_df['corr_values'].values), color='y')
ax.set_yticks(ind)
ax.set_yticklabels(corr_df['col_labels'].values, rotation='horizontal')
plt.xlabel('Correlation coefficient')
plt.title('Correlation coefficient of the variables')
plt.show()



In [246]:
corr_df_sel = corr_df.loc[(corr_df['corr_values'] > 0.02) 
                          | (corr_df['corr_values'] < -0.01)]
display(corr_df_sel)


col_labels corr_values
41 taxamount -0.014768
19 heatingorsystemtypeid -0.013732
36 yearbuilt 0.021171
4 bedroomcnt 0.032035
16 fullbathcnt 0.034267
6 calculatedbathnbr 0.036019
3 bathroomcnt 0.036862
8 calculatedfinishedsquarefeet 0.047659
9 finishedsquarefeet12 0.048611

In [247]:
cols_to_use = corr_df_sel['col_labels'].tolist()

temp_df = train_df_new[cols_to_use]
corrmat = temp_df.corr(method='spearman')
f, ax = plt.subplots(figsize=(8, 8))

sns.heatmap(corrmat, vmax=1., square=True, cmap='coolwarm')
plt.title('Important variables correlation map', fontsize=15)
plt.show()


finishedsquarefeet12


In [248]:
col = 'finishedsquarefeet12'

In [249]:
draw_scatter_plot(train_df_new, col)
sns.jointplot(col, 'logerror', data=train_df_new, size=10)


Out[249]:
<seaborn.axisgrid.JointGrid at 0x7f64514c1f28>

In [250]:
convert_outlier_value(train_df_new, col, lower_percentile=0.5, upper_percentile=99.5)


upper limit : 5851.63 , lower limit : 560.0

In [251]:
draw_scatter_plot(train_df_new, col)
sns.jointplot(col, 'logerror', data=train_df_new, size=10)


Out[251]:
<seaborn.axisgrid.JointGrid at 0x7f644dba3710>

calculatedfinishedsquarefeet


In [252]:
col = "calculatedfinishedsquarefeet"

In [253]:
draw_scatter_plot(train_df_new, col)
sns.jointplot(col, 'logerror', data=train_df_new, size=10)


Out[253]:
<seaborn.axisgrid.JointGrid at 0x7f644de73e48>

In [254]:
convert_outlier_value(train_df_new, col, lower_percentile=0.5, upper_percentile=99.5)


upper limit : 5946.26 , lower limit : 560.0

In [255]:
draw_scatter_plot(train_df_new, col)
sns.jointplot(col, 'logerror', data=train_df_new, size=10)


Out[255]:
<seaborn.axisgrid.JointGrid at 0x7f644da665c0>

bathroomcnt


In [274]:
col = 'bathroomcnt'

In [275]:
draw_count_plot(train_df_new, col)
draw_box_plot(train_df_new, col, 'logerror')


bedroomcnt


In [276]:
col = 'bedroomcnt'

In [279]:
draw_count_plot(train_df_new, col)
draw_box_plot(train_df_new, col, 'logerror')
draw_violin_plot(train_df_new, col, 'logerror')


taxamount


In [280]:
col = "taxamount"

In [282]:
draw_scatter_plot(train_df_new, col)
sns.jointplot(col, 'logerror', data=train_df_new, size=10)


Out[282]:
<seaborn.axisgrid.JointGrid at 0x7f644dbd1c50>

In [283]:
convert_outlier_value(train_df_new, col, lower_percentile=0.5, upper_percentile=99.5)


upper limit : 41786.7296 , lower limit : 594.9285

In [284]:
draw_scatter_plot(train_df_new, col)
sns.jointplot(col, 'logerror', data=train_df_new, size=10)


Out[284]:
<seaborn.axisgrid.JointGrid at 0x7f644c5208d0>

yearbuilt


In [285]:
col = 'yearbuilt'

In [286]:
draw_scatter_plot(train_df_new, col)
sns.jointplot(col, 'logerror', data=train_df_new, size=10)


Out[286]:
<seaborn.axisgrid.JointGrid at 0x7f64448ce630>

In [292]:
# Import only the names used here and in the ggplot cells below --
# `from ggplot import *` pollutes the namespace and can shadow other helpers.
# NOTE(review): the ggplot package is unmaintained and breaks with modern
# pandas; consider plotnine as a drop-in replacement.
from ggplot import aes, geom_point, ggplot, scale_color_gradient
ggplot(aes(x='yearbuilt', y='logerror'), data=train_df_new) + \
    geom_point(color='steelblue', size=1)


Out[292]:
<ggplot: (8754256894728)>

In [294]:
ggplot(aes(x='latitude', y='longitude', color='logerror'), data=train_df_new) + \
    geom_point() + \
    scale_color_gradient(low = 'red', high = 'blue')


Out[294]:
<ggplot: (-9223363282597945607)>

In [296]:
ggplot(aes(x='finishedsquarefeet12', y='taxamount', color='logerror'), data=train_df_new) + \
    geom_point(alpha=0.7) + \
    scale_color_gradient(low = 'pink', high = 'blue')


Out[296]:
<ggplot: (-9223363282598127016)>

feature importance


In [303]:
# Target vector for the models below.
train_y = train_df_new['logerror'].values

# Drop the object-dtype columns plus identifier/target/date fields so only
# numeric predictors remain for the tree models below.
drop_cols = ["hashottuborspa", "propertycountylandusecode", "propertyzoningdesc", "fireplaceflag", "taxdelinquencyflag",
            'parcelid', 'logerror', 'transactiondate', 'transaction_month']
numeric_train_df = train_df_new.drop(drop_cols, axis=1)
feat_names = numeric_train_df.columns.values

In [308]:
from sklearn import ensemble
# Quick feature-importance probe with extremely randomized trees.
# random_state is pinned for reproducibility; n_jobs=-1 uses all cores.
model = ensemble.ExtraTreesRegressor(n_estimators=25, max_depth=30, max_features=0.3, 
                                     n_jobs=-1, random_state=0)
model.fit(numeric_train_df, train_y)
importances = model.feature_importances_
# Spread of each feature's importance across trees; used as error bars below.
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
# Indices of the top-20 features, most important first.
indices = np.argsort(importances)[::-1][:20]

In [309]:
plt.figure(figsize=(12,12))
plt.title("Feature importances")
plt.bar(range(len(indices)), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(len(indices)), feat_names[indices], rotation='vertical')
plt.xlim([-1, len(indices)])
plt.show()



In [312]:
import xgboost as xgb
# Baseline XGBoost fit, used only to read off feature importances.
xgb_params = {
    'eta': 0.05,  # learning rate
    'max_depth': 8,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',  # NOTE(review): renamed to 'reg:squarederror' in xgboost >= 0.83 -- confirm installed version
    'silent': 1,
    'seed' : 0
}
dtrain = xgb.DMatrix(numeric_train_df, train_y, feature_names=feat_names)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=50)

# plot the important features #
fig, ax = plt.subplots(figsize=(12,18))
xgb.plot_importance(model, height=0.8, ax=ax)
plt.show()



In [259]:
# Leftover smoke-test cell; safe to delete.
print('hi')


hi